log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")
knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.width = 10, fig.height=5)
require(ggplot2)
## Loading required package: ggplot2
require(data.table)
## Loading required package: data.table
require(lme4)
## Loading required package: lme4
## Loading required package: Matrix
require(lmerTest)
## Loading required package: lmerTest
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
require(lqmm)
## Loading required package: lqmm
require(car)
## Loading required package: car
## Loading required package: carData
require(cowplot)
## Loading required package: cowplot
require(DT)
## Loading required package: DT
require(pheatmap)
## Loading required package: pheatmap
require(limma)
## Loading required package: limma
require(cluster)
## Loading required package: cluster
meta <- fread(snakemake@input[["meta"]])
meta[,PatientID_Time := paste(PatientID, Time_seq, sep ="_")]
meta <- meta[Time_seq != -1,]
clinic <- fread(snakemake@input[["clinic"]], drop=1)
meta <- merge(meta,clinic, by="PatientID_Time", all.x=TRUE, suffix = c(".meta",""))
#meta_merge <- merge(meta,clinic, all.x=TRUE)
subSys <- fread(snakemake@input[["subSysCount"]])
subSysCount <- as.matrix(subSys[,-1, with =F])
rownames(subSysCount) <- gsub(" ","_",unlist(subSys[,1]))
rxns <- fread(snakemake@input[["rxnCount"]])
rxnCount <- as.matrix(rxns[,-1,with=F ])
rownames(rxnCount) <- gsub(" ","_",unlist(rxns[,1]))
subSystems <- fread(snakemake@input[["subsysTable"]])
GPR <- fread(snakemake@input[["GPR"]])
# remove degraded samples
blcklst <- fread(snakemake@input[["sampleBlacklist"]])
blckSmpls <- blcklst[relevance < 2, Sample]
meta <- meta[!(SeqID %in% blckSmpls),]
subSysCount <- subSysCount[,!(colnames(rxnCount) %in% blckSmpls)]
rxnCount <- rxnCount[,!(colnames(rxnCount) %in% blckSmpls)]
meta <- meta[SeqID %in% colnames(rxnCount),]
lqmmModels <- fread(snakemake@input[["lqmmModels"]])
lmmModels <- fread(snakemake@input[["lmmModels"]])
rxnDistStats <- fread(snakemake@input[["rxn_stats"]])
fva <- readRDS(snakemake@input[["fva"]])
distTar <- snakemake@input[["distFiles"]]
# than we can still use a simple wilcox test for each reaction (present/absence) to test association with HB_Mayo
trxnCount <- data.table(t(rxnCount), keep.rownames=TRUE)
rxn_wilcox <- merge(meta[Tissue == "Biopsy" & !is.na(HB_Mayo_impu),],
trxnCount,
by.x ="SeqID",
by.y="rn")
rxn <- rownames(rxnCount)[apply(rxnCount,1,sum)>0]
#wcxTest <- data.frame(
# log2FC = rep(NA, length(rxn)),
# pval = rep(NA,length(rxn)),
# padj = rep(NA,length(rxn)),
# row.names = rxn)
#for(r in rxn){
# sel <- rxn_wilcox[[r]]
# if(sum(sel) < length(sel)-3 & sum(sel) > 3){
# wcx <- wilcox.test(rxn_wilcox[sel, HB_Mayo_impu],
# rxn_wilcox[!sel,HB_Mayo_impu])
# wcxTest[r,"pval"] <- wcx[["p.value"]]
# wcxTest[r,"log2FC"] <- mean(log2(rxn_wilcox[sel, HB_Mayo_impu]+1)) -
# mean(log2(rxn_wilcox[!sel,HB_Mayo_impu]+1))
# }
#}
#wcxTest[["padj"]] <- p.adjust(wcxTest[["pval"]], method = "BH")
#wcxTest <- data.table(wcxTest,keep.rownames=TRUE)
#wcxTest[,absLog2FC := abs(log2FC)]
#wcxTest[,rxn_plot := factor(rn, level = rn[order(log2FC)])]
models <- list()
i <- 0
require(doMC)
require(parallel)
require(foreach)
registerDoMC(cores = detectCores()-1)
#for(r in rxn){
# i<-i+1
# sel <- rxn_wilcox[[r]]
# if(sum(sel) < length(sel)-3 & sum(sel) > 3){
# mod <- lmer(data=rxn_wilcox, HB_Mayo_impu ~ sel + (1|PatientID))
# sumMod <- summary(mod)[["coefficients"]]
# models[[r]] <- cbind(rxn = r, coefficient =rownames(sumMod), as.data.frame(sumMod))
# }
# current <- round(i/length(rxn), 2)
# cat(paste0(current," \r"))
#}
models <- foreach(r = rxn) %dopar% {
i<-i+1
sel <- rxn_wilcox[[r]]
if(sum(sel) < length(sel)-3 & sum(sel) > 3){
mod <- lmer(data=rxn_wilcox, HB_Mayo_impu ~ sel + (1|PatientID))
sumMod <- summary(mod)[["coefficients"]]
cbind(rxn = r, coefficient =rownames(sumMod), as.data.frame(sumMod))
}
}
names(models) <- rxn
sel <- which(!sapply(models,is.null))
models <- models[sel]
mm <- data.table(do.call(rbind, models))
mm[, padj := p.adjust(`Pr(>|t|)`, method = "BH"), by = coefficient]
mm[coefficient != "(Intercept)" & padj < 0.05, ]
## rxn coefficient Estimate Std. Error df t value
## 1: 5HLTDL selTRUE -1.670913 0.4636078 266.7558 -3.604152
## 2: ACCOACm selTRUE -1.215800 0.4472121 246.9083 -2.718621
## 3: ACOAD10m selTRUE -1.103716 0.3356784 237.8064 -3.288017
## 4: ACOAD8m selTRUE -1.084178 0.3379266 238.3950 -3.208325
## 5: ALOX5 selTRUE 1.468230 0.4053379 255.1258 3.622237
## ---
## 432: EX_HC00250(u) selTRUE 1.385959 0.3388535 236.7123 4.090141
## 433: EX_prostgi2(u) selTRUE 1.302116 0.4847191 238.6614 2.686331
## 434: EX_C02470(u) selTRUE -2.016764 0.3696785 257.7699 -5.455454
## 435: EX_3ivcrn(u) selTRUE 1.080751 0.3169071 236.1712 3.410307
## 436: EX_c51crn(u) selTRUE -1.246189 0.3390505 231.9877 -3.675525
## Pr(>|t|) padj
## 1: 3.735852e-04 5.982231e-03
## 2: 7.020400e-03 4.577367e-02
## 3: 1.161637e-03 1.291853e-02
## 4: 1.518293e-03 1.568334e-02
## 5: 3.523338e-04 5.749921e-03
## ---
## 432: 5.911977e-05 1.244674e-03
## 433: 7.731739e-03 4.913395e-02
## 434: 1.145226e-07 3.096602e-06
## 435: 7.630651e-04 9.071266e-03
## 436: 2.948818e-04 4.972914e-03
# creating a matrix for a venn diagram
vennMat <- matrix(FALSE,nrow=nrow(subSystems), ncol = 4, dimnames = list(rxns = subSystems[["rxns"]], test = c("presence","range_LQMM","range_LMM","distance")))
#vennMat[wcxTest[padj <= 0.05, rn], "presence"] <- TRUE
vennMat[mm[padj <= 0.05 & coefficient != "(Intercept)", rxn], "presence"] <- TRUE
vennMat[lqmmModels[padj <= 0.05, variable], "range_LQMM"] <- TRUE
vennMat[rxnDistStats[padj <= 0.05 & Variable == "HB_Mayo_impu", Rxn], "distance"] <- TRUE
vennMat[lmmModels[padj <= 0.05, variable], "range_LMM"] <- TRUE
# plotting the venn diagram
limma::vennDiagram(vennMat)
# looking into the middle of it
vennSig <- vennMat[apply(vennMat,1,sum) == ncol(vennMat),]
vennSig <- data.frame(vennSig)
setkey(subSystems, "rxns")
vennSig[["Subsystems"]] <- subSystems[rownames(vennSig),subSys]
# get the coefficients for the lmms
rxns <- rownames(vennSig)[vennSig[,"range_LMM"]]
fva_range <- fva[rxns,,"range"]
fva_range <- apply(fva_range, 2,scale)
rownames(fva_range) <- rxns
fva_range <- data.table(data.frame(t(fva_range)), keep.rownames=TRUE)
fva_range <- merge(fva_range, meta[Tissue =="Biopsy",], by.x = "rn", by.y = "SeqID")
setkey(lmmModels, "variable")
lmmModels[,coefficient := 0]
coefs <- foreach(reac = rxns) %dopar%
#for(reac in rxns)
{
if(strsplit(reac,split = "")[[1]][1] %in% as.character(0:9)){
reac_x <- gsub("\\((.*)\\)",".\\1.",paste0("X",reac))
} else {
reac_x <-gsub("\\((.*)\\)",".\\1.",reac)
}
fom <- as.formula(paste(reac_x, "~ HB_Mayo_impu + (1|PatientID)"))
mod <- lmer(data = fva_range, formula = fom)
fixef(mod)[2]
}
lmmModels[rxns, coefficient := unlist(coefs)]
# add the metrics
setkey(mm,"rxn")
vennSig[["Presence_Coef"]] <- mm[coefficient != "(Intercept)",][rownames(vennSig), Estimate]
setkey(lqmmModels,"variable")
vennSig[["Range_coeff_LQMM"]] <- as.numeric(lqmmModels[rownames(vennSig),coefficient])
setkey(rxnDistStats,"Rxn")
vennSig[["Dist_R2"]] <- rxnDistStats[rownames(vennSig),R2]
setkey(lmmModels, "variable")
vennSig[["Range_coeff_LMM"]]<-as.numeric(lmmModels[rownames(vennSig),coefficient])
# plot the wilcoxon fold change
pp_log2FC <-ggplot(data = vennSig, aes(y = Subsystems, x = Presence_Coef, color = Subsystems)) +
geom_vline(xintercept = 0,linetype=2)+
geom_jitter(height=0.25, width = 0,size=2) +
theme_bw() +
theme(legend.position = "none")
# plot the lqmm coefficient
pp_coeff<-ggplot(data = vennSig, aes(y = Subsystems, x = as.numeric(Range_coeff_LQMM),color = Subsystems)) +
geom_vline(xintercept = 0,linetype=2)+
geom_jitter(height=0.25, width = 0,size=2)+
xlab("Coefficient of LQMM model")+
theme_bw() +
theme(legend.position = "none",
axis.text.y = element_blank(),
axis.title.y = element_blank())
# plot the lmm coefficient
pp_coeff_LMM<-ggplot(data = vennSig, aes(y = Subsystems, x = as.numeric(Range_coeff_LMM),color = Subsystems)) +
geom_vline(xintercept = 0,linetype=2)+
geom_jitter(height=0.25, width = 0,size=2)+
xlab("Coefficient of LMM model")+
theme_bw() +
theme(legend.position = "none",
axis.text.y = element_blank(),
axis.title.y = element_blank())
# plot the variance in distance explained by the variable tested
pp_R2 <-ggplot(data = vennSig, aes(y = Subsystems, x = Dist_R2,color =Subsystems)) +
geom_jitter(height=0.25, width = 0,size=2)+
xlab("Distance variance explained by HB/Mayo")+
theme_bw() +
theme(legend.position = "none",
axis.text.y = element_blank(),
axis.title.y = element_blank())
pp <- cowplot::plot_grid(pp_log2FC,pp_coeff,pp_coeff_LMM,pp_R2,ncol=4,rel_widths = c(2,1,1,1))
pp
pp_log2FCbar <-ggplot(data = vennSig[order(vennSig[["Presence_Coef"]],vennSig[["Subsystems"]]),], aes(x=1:nrow(vennSig),y = Presence_Coef, fill = Subsystems)) +
geom_bar(stat="identity")+
theme_bw() +
xlab("Rxns")+
ylab("Fold change of HB/Mayo score on presence of rxn")+
theme(axis.text.x = element_blank())
vennSig2 <- merge(data.table(vennSig, keep.rownames =TRUE), GPR, by.x="rn",by.y="rxn",keep.x=TRUE)
# function to count the genes in a string
countGPRgenes <- function(pattern, text){
# @text - a vector of character strings to look for the pattern
# @pattern - a pattern to look for in the text
# returns - a list of the same length as text with the number of unique patterns
result <- list()
cnt <- 0
for(t in text){
cnt <- cnt+1
srch <- gregexpr(pattern, t)[[1]]
res <- c()
for(i in 1:length(srch)){
srch_len <- attributes(srch)[["match.length"]][i]
res<- c(res,substr(t,srch[i],srch[i]+srch_len-1))
}
result[[cnt]] <- unique(res)
}
return(result)
}
vennPlot <- vennSig2[,.(numberRxn = .N,
mean_log2FC = mean(Presence_Coef),
mean_coeff_lqmm= mean(Range_coeff_LQMM),
mean_coeff_lmm= mean(Range_coeff_LMM),
mean_R2 = mean(as.numeric(Dist_R2))),
by = Subsystems]
# extract the number of genes involved
ngenes <- sapply(countGPRgenes("HGNC:\\d+",unlist(vennSig2[,.(n_genes = gsub("\\s","",paste(GPR,collapse=","))),by="Subsystems"][,2])), length)
vennPlot[,n_genes := ngenes]
pp <- ggplot(data = vennPlot, aes(y=mean_coeff_lmm, x = mean_log2FC, size = mean_R2, color = Subsystems)) +
geom_point()
pp
pp_genesbar <- ggplot(data = vennPlot, aes(x=Subsystems, y=n_genes, fill = Subsystems)) +
geom_bar(stat="identity") +
ylab("Number of genes involved")+
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_blank())
pp_inset <- ggdraw() +
draw_plot(pp_log2FCbar) +
draw_plot(pp_genesbar, x=0.40,y=0.07, width=0.3,height=0.3)
pp_inset
This is the list of significant genes from all analysis
datatable(vennSig2)
Having a list of genes, we can now look into the relation of the ranges to the HB/Mayo score.
sig_rxn <- vennSig2[,rn]
fva_min <- reshape2::melt(fva[sig_rxn,,"min"])
fva_max <- reshape2::melt(fva[sig_rxn,,"max"])
fva_range <-reshape2::melt(fva[sig_rxn,,"range"])
fva_center <-reshape2::melt(fva[sig_rxn,,"center"])
fva_dat <- merge(fva_min,fva_max,suffix = c(".min",".max"),by=c("rxn","sample"))
fva_dat <- merge(fva_dat,fva_center,by=c("rxn","sample"))
fva_dat <- merge(fva_dat,fva_range,suffix = c(".center",".range"),by=c("rxn","sample"))
fva_dat <- reshape2::melt(fva_dat, id.vals = paste0("value.",dimnames(fva)[3]))
fva_dat[["variable"]] <- gsub("^value\\.","",fva_dat[["variable"]])
fva_dat <- merge(fva_dat, meta, by.x="sample",by.y="SeqID")
fva_dat <- merge(fva_dat, vennSig2, by.x="rxn",by.y="rn")
fva_dat <- data.table(fva_dat)
# select only those reaction which are not based on the same GPR
rxnSelection <- vennSig2[!duplicated(GPR),rn]
rdr <- vennSig2[,rn][order(vennSig2[,Subsystems])]
rdr <- split(rdr, ceiling(seq_along(rdr)/20))
# dotplots
for(rxnSelection in rdr){
fva_plot <- fva_dat[rxn %in% rxnSelection,]
pp <- ggplot(fva_plot[variable %in% c("min","max"),], aes(x=HB_Mayo_impu, y=value, color=variable)) +
geom_jitter(height= 0, width=0.25,alpha=0.5) +
geom_smooth()+
facet_wrap(~rxn, scale ="free")
print(pp)
}
Using only the significant reactions we can try to cluster the data once more.
distFiles <- untar(distTar, list=TRUE)
f <- sapply(paste0("/",vennSig2[,rn],".csv"),
grep,
x = distFiles,
value=TRUE,
fixed=TRUE)
tmpd <- file.path(tempdir(),"untar")
dir.create(tmpd)
untar(distTar, files = unlist(f),exdir = tmpd)
distMat <- lapply(file.path(tmpd,unlist(f)),read.csv,row.names=1)
distMat <- simplify2array(lapply(distMat,as.matrix))
dimnames(distMat)[[3]] <- vennSig2[,rn]
unlink(tmpd, recursive=TRUE)
distAll <- apply(distMat,1:2,mean)
distscl <- cmdscale(as.dist(distAll))
distscl <- merge(meta,data.table(distscl, keep.rownames=TRUE), by.x="SeqID", by.y="rn")
pp <- ggplot(distscl, aes(x=V1,y=V2, color = HB_Mayo_impu<5, shape = Response,label = Response)) +
geom_text() +
facet_wrap(~Diagnosis)
pp
pp <- ggplot(distscl, aes(x=V1,y=V2, color = as.factor(Time_seq), shape = HB_Mayo_impu<5)) +
geom_point(size=3,alpha=0.5)
pp
pp <- ggplot(distscl, aes(x=HB_Mayo_impu, y=V1)) +
geom_point()+
geom_smooth()
pp
pp <- ggplot(distscl, aes(x=HB_Mayo_impu, y=V2)) +
geom_point()+
geom_smooth()
pp
with(distscl,cor.test(HB_Mayo_impu,V1,method="kendall"))
##
## Kendall's rank correlation tau
##
## data: HB_Mayo_impu and V1
## z = 7.6445, p-value = 2.098e-14
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.2195279
with(distscl,cor.test(HB_Mayo_impu,V2,method="kendall"))
##
## Kendall's rank correlation tau
##
## data: HB_Mayo_impu and V2
## z = -0.5743, p-value = 0.5658
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.01649235
distclst <- pam(distAll, k = 2, diss = TRUE)
setkey(distscl, "SeqID")
distscl[names(distclst[["clustering"]]), PAMs := distclst[["clustering"]]]
pp <- ggplot(distscl, aes(x=factor(PAMs), y = HB_Mayo_impu)) +
geom_boxplot()
pp
wilcox.test(data=distscl, HB_Mayo_impu~factor(PAMs))
##
## Wilcoxon rank sum test with continuity correction
##
## data: HB_Mayo_impu by factor(PAMs)
## W = 15744, p-value = 1.509e-11
## alternative hypothesis: true location shift is not equal to 0
# do a plot of all reactions singular
for(i in 1:length(rdr)){
rxns <-rdr[[i]]
distRy <- distMat[,,rxns]
dstscl <- lapply(rxns,function(x) cmdscale(as.dist(distRy[,,x])))
names(dstscl) <- rxns
for(reac in seq_along(dstscl)){
dstscl[[reac]] <- cbind(data.frame(dstscl[[reac]]),rxn = names(dstscl)[reac],SeqID = rownames(dstscl[[reac]]))
}
dstscl <- data.table(do.call(rbind, dstscl))
dstscls <- merge(meta, dstscl, by="SeqID")
pp <- ggplot(dstscls, aes(x=X1,y=X2, color = HB_Mayo_impu<5, shape = Remission)) +
geom_point(size=3,alpha=0.5) +
facet_wrap(~rxn, scale = "free")
print(pp)
pp <- ggplot(dstscls, aes(x=HB_Mayo_impu, y=X1)) +
geom_point()+
geom_smooth()+
facet_wrap(~rxn, scale = "free")
print(pp)
pp <- ggplot(dstscls, aes(x=HB_Mayo_impu, y=X2)) +
geom_point()+
geom_smooth()+
facet_wrap(~rxn, scale = "free")
print(pp)
}